{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collection of Helpful Functions for [Class](https://sites.wustl.edu/jeffheaton/t81-558/)\n",
    "\n",
    "This is a collection of helpful functions that I will introduce during this course. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "import os\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "from sklearn import preprocessing\n",
    "\n",
    "\n",
    "# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)\n",
    "def encode_text_dummy(df, name):\n",
    "    dummies = pd.get_dummies(df[name])\n",
    "    for x in dummies.columns:\n",
    "        dummy_name = f\"{name}-{x}\"\n",
    "        df[dummy_name] = dummies[x]\n",
    "    df.drop(name, axis=1, inplace=True)\n",
    "\n",
    "\n",
    "# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1\n",
    "# at every location where the original column (name) matches each of the target_values.  One column is added for\n",
    "# each target value.\n",
    "def encode_text_single_dummy(df, name, target_values):\n",
    "    for tv in target_values:\n",
    "        l = list(df[name].astype(str))\n",
    "        l = [1 if str(x) == str(tv) else 0 for x in l]\n",
    "        name2 = f\"{name}-{tv}\"\n",
    "        df[name2] = l\n",
    "\n",
    "\n",
    "# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).\n",
    "def encode_text_index(df, name):\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    df[name] = le.fit_transform(df[name])\n",
    "    return le.classes_\n",
    "\n",
    "\n",
    "# Encode a numeric column as zscores\n",
    "def encode_numeric_zscore(df, name, mean=None, sd=None):\n",
    "    if mean is None:\n",
    "        mean = df[name].mean()\n",
    "\n",
    "    if sd is None:\n",
    "        sd = df[name].std()\n",
    "\n",
    "    df[name] = (df[name] - mean) / sd\n",
    "\n",
    "\n",
    "# Convert all missing values in the specified column to the median\n",
    "def missing_median(df, name):\n",
    "    med = df[name].median()\n",
    "    df[name] = df[name].fillna(med)\n",
    "\n",
    "\n",
    "# Convert all missing values in the specified column to the default\n",
    "def missing_default(df, name, default_value):\n",
    "    df[name] = df[name].fillna(default_value)\n",
    "\n",
    "\n",
    "# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs\n",
    "def to_xy(df, target):\n",
    "    result = []\n",
    "    for x in df.columns:\n",
    "        if x != target:\n",
    "            result.append(x)\n",
    "    # find out the type of the target column.  Is it really this hard? :(\n",
    "    target_type = df[target].dtypes\n",
    "    target_type = target_type[0] if hasattr(\n",
    "        target_type, '__iter__') else target_type\n",
    "    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.\n",
    "    if target_type in (np.int64, np.int32):\n",
    "        # Classification\n",
    "        dummies = pd.get_dummies(df[target])\n",
    "        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)\n",
    "    # Regression\n",
    "    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)\n",
    "\n",
    "# Nicely formatted time string\n",
    "\n",
    "\n",
    "def hms_string(sec_elapsed):\n",
    "    h = int(sec_elapsed / (60 * 60))\n",
    "    m = int((sec_elapsed % (60 * 60)) / 60)\n",
    "    s = sec_elapsed % 60\n",
    "    return f\"{h}:{m:>02}:{s:>05.2f}\"\n",
    "\n",
    "\n",
    "# Regression chart.\n",
    "def chart_regression(pred, y, sort=True):\n",
    "    t = pd.DataFrame({'pred': pred, 'y': y.flatten()})\n",
    "    if sort:\n",
    "        t.sort_values(by=['y'], inplace=True)\n",
    "    plt.plot(t['y'].tolist(), label='expected')\n",
    "    plt.plot(t['pred'].tolist(), label='prediction')\n",
    "    plt.ylabel('output')\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "\n",
    "# Remove all rows where the specified column is +/- sd standard deviations\n",
    "\n",
    "\n",
    "def remove_outliers(df, name, sd):\n",
    "    drop_rows = df.index[(np.abs(df[name] - df[name].mean())\n",
    "                          >= (sd * df[name].std()))]\n",
    "    df.drop(drop_rows, axis=0, inplace=True)\n",
    "\n",
    "\n",
    "# Encode a column to a range between normalized_low and normalized_high.\n",
    "def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,\n",
    "                         data_low=None, data_high=None):\n",
    "    if data_low is None:\n",
    "        data_low = min(df[name])\n",
    "        data_high = max(df[name])\n",
    "\n",
    "    df[name] = ((df[name] - data_low) / (data_high - data_low)) \\\n",
    "        * (normalized_high - normalized_low) + normalized_low\n"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3.9 (tensorflow)",
   "language": "python",
   "name": "tensorflow"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
